# Map each lat/lon point to its country and, within the USA, to its state.
# library(maps)
library(mapdata)
## Loading required package: maps
library(sp)
library(raster)
##
## Attaching package: 'raster'
## The following object is masked from 'package:skimr':
##
## bind
## The following object is masked from 'package:plotly':
##
## select
library(rworldmap)
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
# Build (or reuse) a cached copy of the UMAP table in which every row carries
# the attributes of the country polygon and the US level-1 (GADM) polygon that
# its lat/lon point falls in. The result is always available as `df`.
dfWithIndicesCache <- "../output/data.with.indices.Rdata"
if (file.exists(dfWithIndicesCache)) {
  # Restores the object stored by save() below under its original name, `df`.
  load(dfWithIndicesCache)
} else {
  df <- fread("../output/50.2.umap.tsv.gz")
  # Drop the lat/lon columns of the UMAP file; they are re-attached from the
  # raw file just below.
  df[, c("lat", "lon") := NULL]
  raw <- fread("../output/raw.tsv.gz")
  # The two tables are joined purely by row position, so fail fast with a clear
  # message instead of relying on cbind()'s handling of unequal row counts.
  if (nrow(raw) != nrow(df)) {
    stop(
      "Row count mismatch: UMAP table has ", nrow(df),
      " rows but raw table has ", nrow(raw), ".",
      call. = FALSE
    )
  }
  df <- cbind(df, raw[, c("lat", "lon")])

  # World country polygons; their CRS is reused for the points and for the
  # US state polygons so that all three layers agree.
  countriesSP <- getMap(resolution = "low")
  mapCRS <- CRS(proj4string(countriesSP))
  pointsSP <- SpatialPoints(df[, c("lon", "lat")], proj4string = mapCRS)
  indices <- over(pointsSP, countriesSP)

  # NOTE(review): raster::getData() is flagged as deprecated in recent raster
  # releases (geodata is the suggested replacement) - confirm before upgrading.
  usaSP <- raster::getData("GADM", country = "USA", level = 1)
  usaSP <- spTransform(usaSP, mapCRS)
  indicesUSA <- over(pointsSP, usaSP)

  # Append the country attributes first, then the US state attributes.
  df <- cbind(df, indices)
  df <- cbind(df, indicesUSA)
  save(df, file = dfWithIndicesCache)
}
# Names of the distance metrics; presumably these match the `umap_<metric>N`
# column suffixes of `df` - verify against the code that consumes this vector.
metricTypes <- c(
  "euclidean",
  "haversine"
)
# Reference: https://umap-learn.readthedocs.io/en/latest/embedding_space.html
# Transformation of a spherical embedding to 2-D, quoted from that page (Python):
#   x = np.sin(sphere_mapper.embedding_[:, 0]) * np.cos(sphere_mapper.embedding_[:, 1])
#   y = np.sin(sphere_mapper.embedding_[:, 0]) * np.sin(sphere_mapper.embedding_[:, 1])
#   z = np.cos(sphere_mapper.embedding_[:, 0])
#
#   x = np.arctan2(x, y)
#   y = -np.arccos(z)
# Print a per-column summary of `df` (missing counts, completeness, and
# type-specific statistics); the tables that follow are its output.
skim(df)
| Name | df |
| Number of rows | 4857144 |
| Number of columns | 100 |
| Key | NULL |
| _______________________ | |
| Column type frequency: | |
| character | 17 |
| factor | 36 |
| numeric | 45 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Activity | 0 | 1.00 | 4 | 10 | 0 | 7 | 0 |
| Name | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| Notes | 0 | 1.00 | 0 | 8 | 4184952 | 15 | 0 |
| UUID | 0 | 1.00 | 16 | 36 | 0 | 63 | 0 |
| Version | 0 | 1.00 | 0 | 28 | 820090 | 30 | 0 |
| Visit | 0 | 1.00 | 0 | 467 | 4854672 | 2473 | 0 |
| imgS3 | 0 | 1.00 | 0 | 50 | 4857004 | 141 | 0 |
| GID_0 | 38323 | 0.99 | 3 | 3 | 0 | 1 | 0 |
| NAME_0 | 38323 | 0.99 | 13 | 13 | 0 | 1 | 0 |
| GID_1 | 38323 | 0.99 | 7 | 8 | 0 | 42 | 0 |
| NAME_1 | 38323 | 0.99 | 4 | 14 | 0 | 42 | 0 |
| VARNAME_1 | 38323 | 0.99 | 2 | 56 | 0 | 42 | 0 |
| NL_NAME_1 | 4857144 | 0.00 | NA | NA | 0 | 0 | 0 |
| TYPE_1 | 38323 | 0.99 | 5 | 5 | 0 | 1 | 0 |
| ENGTYPE_1 | 38323 | 0.99 | 5 | 5 | 0 | 1 | 0 |
| CC_1 | 4857144 | 0.00 | NA | NA | 0 | 0 | 0 |
| HASC_1 | 38323 | 0.99 | 5 | 5 | 0 | 42 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| FeatureCla | 83980 | 0.98 | FALSE | 1 | Adm: 4773164, Adm: 0 |
| SOVEREIGNT | 83980 | 0.98 | FALSE | 12 | Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028 |
| SOV_A3 | 83980 | 0.98 | FALSE | 12 | US1: 4749730, CZE: 6763, CAN: 4769, POL: 4028 |
| TYPE | 83980 | 0.98 | FALSE | 2 | Cou: 4749774, Sov: 23390, Cou: 0, Dep: 0 |
| ADMIN | 83980 | 0.98 | FALSE | 12 | Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028 |
| ADM0_A3 | 83980 | 0.98 | FALSE | 12 | USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028 |
| GEOUNIT | 83980 | 0.98 | FALSE | 12 | Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028 |
| GU_A3 | 83980 | 0.98 | FALSE | 12 | USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028 |
| SUBUNIT | 83980 | 0.98 | FALSE | 12 | Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028 |
| SU_A3 | 83980 | 0.98 | FALSE | 12 | USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028 |
| NAME | 83980 | 0.98 | FALSE | 12 | Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028 |
| ABBREV | 83980 | 0.98 | FALSE | 12 | U.S: 4749730, Cz.: 6763, Can: 4769, Pol: 4028 |
| POSTAL | 83980 | 0.98 | FALSE | 12 | US: 4749730, CZ: 6763, CA: 4769, PL: 4028 |
| NAME_FORMA | 90912 | 0.98 | FALSE | 10 | Uni: 4749730, Cze: 6763, Rep: 4028, Rep: 3272 |
| TERR_ | 4857144 | 0.00 | FALSE | 0 | Ass: 0, Auz: 0, Chi: 0, Com: 0 |
| NAME_SORT | 83980 | 0.98 | FALSE | 12 | Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028 |
| FIPS_10_ | 4857144 | 0.00 | FALSE | 0 | FG: 0 |
| ISO_A2 | 83980 | 0.98 | FALSE | 12 | US: 4749730, CZ: 6763, CA: 4769, PL: 4028 |
| ISO_A3 | 83980 | 0.98 | FALSE | 12 | USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028 |
| ISO3 | 83980 | 0.98 | FALSE | 12 | USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028 |
| ISO3.1 | 83980 | 0.98 | FALSE | 12 | USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028 |
| ADMIN.1 | 83980 | 0.98 | FALSE | 12 | Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028 |
| REGION | 83980 | 0.98 | FALSE | 3 | Nor: 4754499, Eur: 15385, Sou: 3280, Afr: 0 |
| continent | 83980 | 0.98 | FALSE | 3 | Nor: 4754499, Eur: 15385, Sou: 3280, Afr: 0 |
| GEO3major | 83980 | 0.98 | FALSE | 3 | Nor: 4754499, Eur: 15385, Lat: 3280, Afr: 0 |
| GEO3 | 83980 | 0.98 | FALSE | 6 | US: 4749730, Cen: 12026, Can: 4769, Sou: 3280 |
| IMAGE24 | 83980 | 0.98 | FALSE | 7 | USA: 4749730, Cen: 12026, Can: 4769, Res: 3272 |
| GLOCAF | 83980 | 0.98 | FALSE | 6 | USA: 4749730, Eur: 13222, Can: 4769, Res: 3272 |
| Stern | 83980 | 0.98 | FALSE | 3 | Nor: 4754499, Eur: 15385, Sou: 3280, Aus: 0 |
| SRESmajor | 83980 | 0.98 | FALSE | 3 | OEC: 4755695, REF: 14189, ALM: 3280, ASI: 0 |
| SRES | 83980 | 0.98 | FALSE | 5 | Nor: 4754499, Cen: 12026, Lat: 3280, New: 2163 |
| GBD | 83980 | 0.98 | FALSE | 6 | Nor: 4754499, Eur: 12026, Lat: 3272, Eur: 2163 |
| AVOIDname | 83980 | 0.98 | FALSE | 8 | US: 4749730, Eur: 11301, Can: 4769, Pol: 4028 |
| LDC | 83980 | 0.98 | FALSE | 1 | oth: 4773164, LDC: 0 |
| SID | 83980 | 0.98 | FALSE | 1 | oth: 4773164, SID: 0 |
| LLDC | 83980 | 0.98 | FALSE | 1 | oth: 4773164, LLD: 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0 | 1.00 | 6.04 | 1.59 | -1.00 | 4.74 | 6.00 | 8.00 | 1.000000e+01 | ▁▁▇▃▅ |
| Elevation | 0 | 1.00 | 309.11 | 308.89 | -67.07 | 146.18 | 250.75 | 349.22 | 1.124989e+04 | ▇▁▁▁▁ |
| Heading | 0 | 1.00 | 164.34 | 109.76 | -1.00 | 70.66 | 165.92 | 259.03 | 3.600000e+02 | ▇▆▇▆▆ |
| Pressure | 0 | 1.00 | 144.33 | 202.58 | 0.00 | 97.40 | 98.87 | 99.85 | 1.027210e+03 | ▇▁▁▁▁ |
| Speed | 0 | 1.00 | 3.16 | 7.60 | -1.00 | 0.00 | 0.00 | 1.39 | 4.989000e+01 | ▇▁▁▁▁ |
| UnixTime | 0 | 1.00 | 1615763853.19 | 60327287.52 | 1525623850.00 | 1557692918.50 | 1622138392.50 | 1674732350.50 | 1.704206e+09 | ▇▅▃▅▇ |
| HeartRate | 4842796 | 0.00 | 88.11 | 28.44 | -1.00 | 76.00 | 85.00 | 100.00 | 1.650000e+02 | ▁▁▇▂▁ |
| Distance | 4810634 | 0.01 | 68964.98 | 174017.51 | 2.00 | 5703.61 | 14004.90 | 45458.82 | 1.060107e+06 | ▇▁▁▁▁ |
| NumberOfSteps | 4808496 | 0.01 | 123151.68 | 269954.22 | 4.00 | 7952.00 | 15226.00 | 67482.00 | 1.153645e+06 | ▇▁▁▁▁ |
| AverageActivePace | 4822571 | 0.01 | 1.05 | 0.70 | 0.36 | 0.73 | 0.93 | 1.07 | 1.509000e+01 | ▇▁▁▁▁ |
| CurrentCadence | 4822631 | 0.01 | 1.67 | 0.32 | 0.99 | 1.48 | 1.66 | 1.78 | 2.710000e+00 | ▃▇▇▂▁ |
| CurrentPace | 4822631 | 0.01 | 0.87 | 0.20 | 0.26 | 0.79 | 0.87 | 1.02 | 1.300000e+00 | ▁▂▇▆▂ |
| FloorsAscended | 4824429 | 0.01 | 14.84 | 14.30 | 1.00 | 3.00 | 10.00 | 16.00 | 5.800000e+01 | ▇▆▁▁▁ |
| FloorsDescended | 4824118 | 0.01 | 11.68 | 10.49 | 1.00 | 5.00 | 11.00 | 15.00 | 4.100000e+01 | ▇▆▁▁▁ |
| vAccuracy | 4844573 | 0.00 | 4.54 | 3.83 | 0.80 | 1.60 | 3.00 | 6.40 | 3.740000e+01 | ▇▂▁▁▁ |
| AccelerometerX | 4850479 | 0.00 | -0.82 | 4.54 | -13.76 | -4.08 | -0.42 | 1.20 | 9.210000e+00 | ▁▆▇▇▅ |
| AccelerometerY | 4850479 | 0.00 | -0.45 | 3.44 | -22.32 | -0.97 | -0.44 | 0.57 | 1.690000e+01 | ▁▁▇▂▁ |
| AccelerometerZ | 4850479 | 0.00 | 3.84 | 7.15 | -14.43 | -1.25 | 6.99 | 9.78 | 1.337000e+01 | ▁▃▁▅▇ |
| ActivityConfidence | 4850479 | 0.00 | 100.00 | 0.00 | 100.00 | 100.00 | 100.00 | 100.00 | 1.000000e+02 | ▁▁▇▁▁ |
| GyroscopeX | 4850479 | 0.00 | 0.00 | 0.30 | -3.86 | -0.01 | 0.00 | 0.01 | 4.170000e+00 | ▁▁▇▁▁ |
| GyroscopeY | 4850479 | 0.00 | 0.01 | 0.23 | -2.83 | -0.01 | 0.00 | 0.01 | 3.860000e+00 | ▁▁▇▁▁ |
| GyroscopeZ | 4850479 | 0.00 | 0.00 | 0.14 | -2.71 | -0.01 | 0.00 | 0.01 | 1.990000e+00 | ▁▁▇▁▁ |
| UserAccelerometerX | 4850479 | 0.00 | 0.00 | 0.76 | -12.67 | -0.15 | 0.00 | 0.13 | 8.760000e+00 | ▁▁▇▂▁ |
| UserAccelerometerY | 4850479 | 0.00 | -0.04 | 0.77 | -13.89 | -0.08 | 0.00 | 0.07 | 1.166000e+01 | ▁▁▇▁▁ |
| UserAccelerometerZ | 4850479 | 0.00 | 0.02 | 0.80 | -16.13 | -0.13 | 0.05 | 0.19 | 8.570000e+00 | ▁▁▁▇▁ |
| Lightmeter | 4855139 | 0.00 | 195.51 | 308.98 | 1.00 | 17.00 | 78.00 | 147.00 | 1.067000e+03 | ▇▁▁▁▁ |
| umap_euclidean0 | 0 | 1.00 | 4.47 | 14.10 | -33.03 | -3.59 | 4.47 | 12.56 | 4.191000e+01 | ▁▃▇▃▁ |
| umap_euclidean1 | 0 | 1.00 | 1.30 | 14.15 | -36.53 | -6.82 | 1.25 | 9.36 | 3.881000e+01 | ▁▃▇▃▁ |
| umap_haversine0 | 0 | 1.00 | 4.36 | 120.81 | -855.47 | -54.06 | 4.53 | 62.14 | 8.866400e+02 | ▁▁▇▁▁ |
| umap_haversine1 | 0 | 1.00 | 1.17 | 79.73 | -568.70 | -37.15 | 1.20 | 39.54 | 5.575500e+02 | ▁▁▇▁▁ |
| lat | 0 | 1.00 | 42.53 | 4.99 | -22.90 | 38.65 | 44.97 | 46.81 | 5.965000e+01 | ▁▁▁▆▇ |
| lon | 0 | 1.00 | -93.57 | 11.62 | -158.23 | -93.26 | -92.08 | -90.28 | 3.090000e+01 | ▁▇▁▁▁ |
| ScaleRank | 83980 | 0.98 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.000000e+00 | ▁▁▇▁▁ |
| LabelRank | 83980 | 0.98 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.000000e+00 | ▁▁▇▁▁ |
| ADM0_DIF | 83980 | 0.98 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.000000e+00 | ▁▁▇▁▁ |
| LEVEL | 83980 | 0.98 | 2.00 | 0.00 | 2.00 | 2.00 | 2.00 | 2.00 | 2.000000e+00 | ▁▁▇▁▁ |
| GEOU_DIF | 83980 | 0.98 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.000000e+00 | ▁▁▇▁▁ |
| SU_DIF | 83980 | 0.98 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.000000e+00 | ▁▁▇▁▁ |
| MAP_COLOR | 83980 | 0.98 | 1.01 | 0.21 | 1.00 | 1.00 | 1.00 | 1.00 | 1.100000e+01 | ▇▁▁▁▁ |
| POP_EST | 83980 | 0.98 | 305840704.97 | 19555866.99 | 4489409.00 | 307212123.00 | 307212123.00 | 307212123.00 | 3.072121e+08 | ▁▁▁▁▇ |
| GDP_MD_EST | 83980 | 0.98 | 14192787.08 | 957328.31 | 82390.00 | 14260000.00 | 14260000.00 | 14260000.00 | 1.426000e+07 | ▁▁▁▁▇ |
| ISO_N3 | 83980 | 0.98 | 837.51 | 39.40 | 40.00 | 840.00 | 840.00 | 840.00 | 8.400000e+02 | ▁▁▁▁▇ |
| LON | 83980 | 0.98 | -98.75 | 6.73 | -102.37 | -99.14 | -99.14 | -99.14 | 3.138000e+01 | ▇▁▁▁▁ |
| LAT | 83980 | 0.98 | 39.56 | 1.29 | -10.84 | 39.53 | 39.53 | 39.53 | 6.284000e+01 | ▁▁▁▇▁ |
| AVOIDnumeric | 83980 | 0.98 | 2.09 | 1.37 | 2.00 | 2.00 | 2.00 | 2.00 | 2.600000e+01 | ▇▁▁▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Time | 0 | 1.00 | 2018-05-06 16:24:10 | 2024-01-02 14:31:47 | 2021-05-27 17:59:52 | 4853226 |
| CurrentTripStart | 4808480 | 0.01 | 2023-12-15 18:59:33 | 2023-12-31 18:01:01 | 2023-12-27 18:38:42 | 11 |